Introduction to R
for Social Scientists

Workshop Day 2A | 2022-07-26
Jeffrey M. Girard | Pitt Methods

Wrangle II

Basic wrangling verbs

  • tidyverse provides tools for wrangling tibbles
    • These functions are named after verbs
    • So if you name your objects after nouns
    • …your code becomes easier to read
Noun(noun) ❌ Verb(noun) ✔️
blender(fruit) blend(fruit)
screwdriver(screw) drive(screw)
boxcutter(box) cut(box)

Column-focused verbs

  • Select retains only certain columns/variables
    • select(TBL, VAR_KEEP, -VAR_DROP)
  • Mutate adds or transforms columns/variables
    • mutate(TBL, NEW_VAR = OLD_VAR / 1000)
  • Rename changes the names of columns/variables
    • rename(TBL, NEW_NAME = OLD_NAME)
  • Relocate changes the order of columns/variables
    • relocate(TBL, VAR_MOVE, .after = OTHER_VAR)

Select Live Coding

# SETUP: Load package and inspect example tibble

library(tidyverse) # includes the dplyr package
starwars # typing the name of a tibble prints it to the console

# ==============================================================================

# USECASE: Retain only the specified variables

# The first argument is the tibble; every argument after it names a variable
sw <- select(starwars, name) # keep a single variable
sw
sw <- select(starwars, name, sex, species) # keep several variables
sw

# ==============================================================================

# PITFALL: Don't forget to save the change with assignment

# Without <- the result is only printed; starwars itself is left untouched
select(starwars, name, sex, species)
starwars # still includes all variables

# ==============================================================================

# USECASE: Retain all variables between two variables

# first:last keeps the range of variables from hair_color through eye_color
sw <- select(starwars, name, hair_color:eye_color)
sw

# ==============================================================================

# USECASE: Retain all variables except the specified ones

# A minus sign in front of a variable drops it and keeps everything else
sw <- select(starwars, -sex, -species)
sw
# Same request as above, with the variables to drop collected in c()
sw <- select(starwars, -c(sex, species))
sw
# The minus sign also works on a range: drop hair_color through starships
sw <- select(starwars, -c(hair_color:starships))
sw

Mutate Live Coding

# SETUP: Create example tibble (written row-by-row, one patient per line)

patients <- tribble(
  ~id,  ~feet, ~inches, ~pounds,
  "S1",     6,       1,   176.3,
  "S2",     5,       7,   124.9,
  "S3",     5,       2,   162.6
)
patients

# ==============================================================================

# USECASE: Add one or more variables

# NOTE: Every example below starts again from patients, so p2 only contains
# the variables added in the most recent call (earlier additions are replaced)

p2 <- mutate(patients, sex = c("M", "F", "F")) # one value per row of patients
p2

ages <- c(32, 41, 29)
# Left of the = is the new variable's name; right of it is the vector made above
p2 <- mutate(patients, ages = ages)
p2

# Several variables can be added in a single call, separated by commas
p2 <- mutate(
  patients, 
  sex = c("M", "F", "F"), 
  ages = ages
)
p2

# ==============================================================================

# USECASE: Compute variables

# New variables can be calculated from existing ones (12 inches per foot)
p2 <- mutate(patients, height = feet + inches / 12)
p2

# Functions can be applied to existing variables too
p2 <- mutate(patients, ln_pounds = log(pounds))
p2

# ==============================================================================

# USECASE: Overwrite variables

# The patients tibble has no height variable yet (earlier examples saved their
# results to p2), so first compute height in feet and save it into patients

patients <- mutate(patients, height = feet + inches / 12)
patients

# Using an existing name on the left of the = replaces that variable's values
# Here height is converted from feet to meters (1 meter is about 3.281 feet)

patients <- mutate(patients, height = height / 3.281)
patients

# ==============================================================================

# USECASE: Duplicate variables

# mutate() adds weight as a copy of pounds and leaves pounds in place
p2 <- mutate(patients, weight = pounds)
p2 # both weight and pounds exist

Rename / Relocate Live Coding

# USECASE: Change the name of one or more variables

starwars

# TEMPLATE: rename(TBL, NEW_NAME = OLD_NAME)
sw <- rename(starwars, Character = name)
sw

# Several variables can be renamed in one call, separated by commas
sw <- rename(starwars, height_cm = height, mass_kg = mass)
sw

# ==============================================================================

# PITFALL: Don't swap the order and try old_name = new_name

# The right-hand side must be an existing variable, and Character is not one
sw <- rename(starwars, name = Character) # error

# ==============================================================================

# USECASE: Move variables before or after another variable

starwars

# List the variables to move, then say where they go with .before or .after
sw <- relocate(starwars, species, sex, .before = height)
sw

sw <- relocate(starwars, species, sex, .after = name)
sw

# ==============================================================================

# PITFALL: Don't forget the period!

# Without the leading period, before = height is read as NEW_NAME = OLD_NAME
sw <- relocate(starwars, sex, before = height) 
sw # height was accidentally renamed to before

Row-focused verbs

  • Arrange sorts rows based on their values
    • arrange(TBL, VAR_SORT_UP)
    • arrange(TBL, desc(VAR_SORT_DOWN))
    • arrange(TBL, VAR_SORT_1ST, VAR_SORT_2ND)
  • Filter retains certain rows based on criteria
    • filter(TBL, DBL_CRIT > 0)
    • filter(TBL, STR_CRIT == "A")
    • filter(TBL, CRIT1, CRIT2)

Arrange Live Coding

# USECASE: Sort observations by a variable

starwars

# arrange() reorders the rows; the variables themselves are left as they are
sw <- arrange(starwars, height)
sw # sorted by height, ascending

sw <- arrange(starwars, name)
sw # sorted by name, alphabetically

# ==============================================================================

# USECASE: Sort observations by a variable, in reverse order

# Wrap the variable in desc() to flip the sort direction
sw <- arrange(starwars, desc(height))
sw # sorted by height, descending

sw <- arrange(starwars, desc(name))
sw # sorted by name, reverse-alphabetically

# ==============================================================================

# USECASE: Sort observations by multiple variables

# The first variable listed takes priority; later ones only break ties
sw <- arrange(starwars, hair_color, mass)
sw # sorted by hair_color, then ties broken by mass

Filter Live Coding

# USECASE: Retain only observations that meet a criterion

# The criterion is tested for every row; only rows that pass are kept
sw <- filter(starwars, mass > 100)
sw # only observations with mass greater than 100

sw <- filter(starwars, mass <= 100)
sw # only observations with mass less than or equal to 100

# Text values must be wrapped in quotes
sw <- filter(starwars, species == "Human")
sw # only observations with species equal to Human

sw <- filter(starwars, species != "Human")
sw # only observations with species not equal to Human

# ==============================================================================

# PITFALL: Don't try to use a single = for testing equality

# A single = names an argument; a double == asks "are these equal?"
sw <- filter(starwars, height = 150) # error

sw <- filter(starwars, height == 150) # correct
sw 

# ==============================================================================

# PITFALL: Don't forget that R is case-sensitive

# "human" and "Human" are different strings, so nothing matches
sw <- filter(starwars, species == "human")
sw # no observations left (because it should be Human)

# ==============================================================================

# USECASE: Retain only observations that meet complex criteria

# & keeps rows where both conditions are true
sw <- filter(starwars, mass > 100 & height > 200)
sw # only observations with mass over 100 AND height over 200

# | keeps rows where at least one condition is true
sw <- filter(starwars, height < 100 | hair_color == "none")
sw # only observations with height under 100 OR hair_color equal to none

# ==============================================================================

# PITFALL: Don't forget to complete both conditions

# Each side of & must be a full comparison; "< 200" alone has no left-hand side
sw <- filter(starwars, mass > 100 & < 200) # error

sw <- filter(starwars, mass > 100 & mass < 200) # correct
sw

# ==============================================================================

# PITFALL: Don't try to equate a string to a vector

# NOTE: This usually does not stop with an error, which makes it easy to miss.
# == compares element by element, so the two values are recycled down the rows
# (row 1 vs "Human", row 2 vs "Droid", row 3 vs "Human", ...). R typically only
# gives a warning, and the result silently drops many Humans and Droids.

sw <- filter(starwars, species == c("Human", "Droid")) # wrong (rows are missed)

# %in% asks whether each row's value is found anywhere in the vector

sw <- filter(starwars, species %in% c("Human", "Droid")) # correct
sw

Filter Cheatsheet

Symbol Description Num Chr
< Less than Yes No
<= Less than or equal to Yes No
> More than Yes No
>= More than or equal to Yes No
== Equal to Yes Yes
!= Not equal to Yes Yes
%in% Found in Yes Yes
& Logical And Yes Yes
| Logical Or Yes Yes

Program III

Pipes and Pipelines

  • How can we do multiple operations to an object?
    1. x <- 10
    2. x2 <- sqrt(x)
    3. x3 <- round(x2)
  • This works but is cumbersome and error-prone
  • A better approach is to use pipes and pipelines
    • x3 <- 10 |> sqrt() |> round()
  • I like to read |> as “and then…”
    • “Take 10 and then sqrt() and then round()”

Pipes Live Coding

# SETUP: Enable the pipe operator shortcut

# Tools > Global Options... > Code tab > Check "Use Native Pipe Operator"

# Type out |> or press Ctrl+Shift+M (Windows) / Cmd+Shift+M (Mac)

# ==============================================================================

# LESSON: The pipe pushes objects to a function as its first argument

# TEMPLATE: x |> function_name() is the same as function_name(x)

x <- 10

# Without a pipe
y <- sqrt(x)
y

# With a pipe (same result)
y <- x |> sqrt()
y

# ==============================================================================

# PITFALL: Don't forget to remove the object from the function call

# The pipe already supplies x, so this is the same as sqrt(x, x)
x |> sqrt(x) # wrong

x |> sqrt() # correct

# ==============================================================================

# USECASE: You can still use arguments when piping

z <- round(3.14, digits = 1)
z

# The piped value fills the first argument; the others are written as usual
z <- 3.14 |> round(digits = 1)
z

# ==============================================================================

# USECASE: Pipes are useful with tibbles and wrangling verbs

starwars

# Without a pipe
sw <- select(starwars, name, species, height)
sw

# With a pipe (same result): the tibble is pushed in as select()'s first argument
sw <- starwars |> select(name, species, height)
sw

# ==============================================================================

# PITFALL: Don't add a pipe without a step after it

# A pipe at the very end has nothing to push the result into
sw <- starwars |> select(name, species, height) |> # error

Pipelines Live Coding

# USECASE: You can chain multiple pipes together to make a pipeline

# Read as: take 10, and then sqrt(), and then round()
x <- 10 |> sqrt() |> round()
x

# ==============================================================================

# TIP: If you want to see the output of a pipeline, you can pipe to print()

# This both saves the result to x and shows it, so no separate x line is needed
x <- 10 |> sqrt() |> round() |> print()

# ==============================================================================

# TIP: To make your pipelines more readable, move each step to a new line

# Ending each line with |> tells R that the pipeline continues on the next line
x <- 
  10 |> 
  sqrt() |> 
  round() |>
  print()

# ==============================================================================

# PITFALL: Don't put the pipe at the beginning of a line, though

# The lines below do not end with a pipe, so R does not know to keep reading
x <- 
  10 
  |> sqrt()
  |> round()
  |> print() # error

# ==============================================================================

# USECASE: Chain together a series of verbs to flexibly wrangle data

# Step by step: keep three variables, rename height to show its unit (cm),
# compute height in feet (30.48 cm per foot), keep only characters taller than
# 7 feet, sort from tallest to shortest, and then print the result
tallones <- 
  starwars |> 
  select(name, species, height) |> 
  rename(height_cm = height) |> 
  mutate(height_ft = height_cm / 30.48) |>  
  filter(height_ft > 7) |> 
  arrange(desc(height_ft)) |>  
  print()

Factors

  • Factors are used to represent categorical data
    • Factors have multiple possible levels
    • Levels are discrete and mutually-exclusive
  • Sometimes categories are unordered (nominal)
    • Action or Comedy or Drama
    • Asia or Europe or North America
  • Sometimes categories are ordered (ordinal)
    • Mild < Medium < Hot
    • XS < S < M < L < XL

Factors Live Coding

# USECASE: Ask 10 kids to order 1: nuggets, 2: pizza, or 3: salad

# Each number is one kid's order (note that nobody ordered 3: salad)
food <- c(2, 2, 1, 2, 1, 2, 1, 1, 2, 2)
food

# ==============================================================================

# LESSON: We can turn this vector into a factor with factor()

# levels lists every possible value, including 3 even though it was never chosen
food2 <- factor(food, levels = c(1, 2, 3))
food2

# labels gives each level a readable name, in the same order as levels
food3 <- factor(food, levels = c(1, 2, 3),
                labels = c("nuggets", "pizza", "salad"))
food3

# ==============================================================================

# USECASE: We can also quickly and easily count each level with table()

# salad is a level of food3, so it is counted too (with a count of 0)
table(food3)

# ==============================================================================

# PITFALL: Don't confuse levels and labels

# levels must match the values actually stored in the data (here 1, 2, 3);
# labels are the names to show instead. Swapping them breaks the matching.
food4 <- factor(food, labels = c(1, 2, 3),
                levels = c("nuggets", "pizza", "salad"))
food4 # full of <NA> because it can't find these levels

# ==============================================================================

# USECASE: You can also just enter strings directly (as self-labels)

genre <- c("pop", "metal", "pop", "rock", "rap", "rap", "pop", "rock")
genre

genre2 <- factor(genre) # observed levels will be assigned alphabetically
genre2

table(genre2)

# ==============================================================================

# LESSON: If ordinal, enter levels low-to-high and add ordered = TRUE

salsa <- c("hot", "mild", "medium", "mild", "medium", "medium")

# The order written in levels defines the ranking: mild < medium < hot
salsa2 <- factor(salsa, 
                 levels = c("mild", "medium", "hot"), 
                 ordered = TRUE)
salsa2 

# NOTE: We may want to visualize or model ordinal factors differently

# ==============================================================================

# USECASE: Working with factors in a tibble

# The relative path assumes cereal.csv is in the current working directory
cereal <- read_csv("cereal.csv")
cereal

# Overwrite mfr and type with factor versions of themselves
cereal2 <- mutate(cereal, mfr = factor(mfr), type = factor(type))
cereal2

# The $ pulls a single variable out of the tibble as a vector
table(cereal2$mfr)

table(cereal2$type)

Missing Values

  • Sometimes your data will have missing values
    • Perhaps these were never collected
    • Perhaps the values were lost/corrupted
    • Perhaps the participant didn’t respond
  • We need to tell R which values are missing
    • To do so, we set those values to NA
    • Functions from tidyverse make this easy
  • Missingness is often “contagious” in R
    • e.g., the mean of a vector that contains an NA is itself NA

Missing Values Live Coding

# SETUP: We will need tidyverse for the read and mutate functions

library(tidyverse)

# ==============================================================================

# PITFALL: Number codes for missingness will mess up calculations in R

heights <- c(149, 158, -999) # here we use -999 to represent a missing value

# R treats -999 like any other number, so it distorts both summaries
range(heights)

mean(heights)

# The log of a negative number is not defined, so -999 turns into NaN
log(heights) # our missing value is no longer -999

# ==============================================================================

# USECASE: Use NA for missingness instead

heights2 <- c(149, 158, NA)
heights2

log(heights2) # the NA stayed an NA (due to contagiousness)

# ==============================================================================

# LESSON: Use na.rm = TRUE to do a summary function ignoring the NAs

mean(heights2) # the mean is an NA (due to contagiousness)

# na.rm = TRUE removes the NAs first and then summarizes what is left
mean(heights2, na.rm = TRUE)

range(heights2, na.rm = TRUE)

# ==============================================================================

# USECASE: Dealing with missing values in tibbles

# The relative path assumes cereal.csv is in the current working directory
cereal <- read_csv("cereal.csv")

cereal$rating

# NOTE: The file appears to use -999 as its missing code (see na_if() below),
# so the minimum shown here is presumably -999 rather than a real rating
range(cereal$rating)

# ==============================================================================

# LESSON: Use na_if() to convert specific values to NA while mutating

# Overwrite rating with a copy in which every -999 has been replaced by NA
cereal2 <- mutate(cereal, rating = na_if(rating, -999))

cereal2$rating

range(cereal2$rating, na.rm = TRUE)

# ==============================================================================

# LESSON: Use read_csv(na) to convert specific values to NA while reading

# NOTE: The na argument replaces the default of c("", "NA") rather than adding
# to it, so list the defaults again; otherwise blank cells and cells containing
# NA would no longer be read as missing

cereal3 <- read_csv("cereal.csv", na = c("", "NA", "-999"))

cereal3$rating

range(cereal3$rating, na.rm = TRUE)

Practice III